In [4]:
import pandas as pd
import seaborn as sns
In [5]:
# Load the CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")

1. Dimension of data?¶

In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape
Out[6]:
(48906, 16)

2. How does the data look like?¶

In [7]:
# Preview the first five rows.
df.head(n=5)
Out[7]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 19-10-2018 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 21-05-2019 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 05-07-2019 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 19-11-2018 0.10 1 0
In [8]:
# Preview the last five rows.
df.tail(n=5)
Out[8]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
48901 5441 Central Manhattan/near Broadway 7989 Kate Manhattan Hell's Kitchen 40.76076 -73.98867 Private room 85 2 188 23-06-2019 1.50 1 39
48902 5803 Lovely Room 1, Garden, Best Area, Legal rental 9744 Laurie Brooklyn South Slope 40.66829 -73.98779 Private room 89 4 167 24-06-2019 1.34 3 314
48903 6021 Wonderful Guest Bedroom in Manhattan for SINGLES 11528 Claudio Manhattan Upper West Side 40.79826 -73.96113 Private room 85 2 113 05-07-2019 0.91 1 333
48904 6090 West Village Nest - Superhost 11975 Alina Manhattan West Village 40.73530 -74.00525 Entire home/apt 120 90 27 31-10-2018 0.22 1 0
48905 6848 Only 2 stops to Manhattan studio 15991 Allen & Irina Brooklyn Williamsburg 40.70837 -73.95352 Entire home/apt 140 2 148 29-06-2019 1.20 1 46
In [9]:
# Five randomly chosen rows. The fixed seed makes the same rows appear on every
# Restart & Run All, so the displayed sample stays reproducible.
df.sample(n=5, random_state=42)
Out[9]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
22052 17741347 Cozy studio w/kitchen & bathroom. Great location 3435092 Elena Manhattan Upper East Side 40.78230 -73.94838 Entire home/apt 105 5 117 23-06-2019 4.29 3 187
46714 35397690 Cozy 3 bedroom apt / Lower East Side 251918661 Alex Manhattan Chinatown 40.71678 -73.99492 Entire home/apt 400 4 3 13-06-2019 2.73 1 57
23701 19161510 Affordable and functional room 3788839 Lorenzo & Alex Brooklyn Bedford-Stuyvesant 40.68468 -73.92408 Private room 45 28 10 27-05-2019 0.46 4 310
7374 5445314 Beautiful Artist loft 21838149 Marcia Brooklyn DUMBO 40.70299 -73.98624 Private room 160 1 54 20-06-2019 1.03 1 10
26952 21354840 BRIGHT & CLEAN UNION SQ GEM APT! 700 SQF ALL Y... 18270371 Ena Manhattan Gramercy 40.73305 -73.98574 Entire home/apt 148 4 10 27-09-2018 0.52 1 0

3. What is the datatype of cols?¶

In [10]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48906 entries, 0 to 48905
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48906 non-null  int64  
 1   name                            48890 non-null  object 
 2   host_id                         48906 non-null  int64  
 3   host_name                       48885 non-null  object 
 4   neighbourhood_group             48906 non-null  object 
 5   neighbourhood                   48906 non-null  object 
 6   latitude                        48906 non-null  float64
 7   longitude                       48906 non-null  float64
 8   room_type                       48906 non-null  object 
 9   price                           48906 non-null  int64  
 10  minimum_nights                  48906 non-null  int64  
 11  number_of_reviews               48906 non-null  int64  
 12  last_review                     38854 non-null  object 
 13  reviews_per_month               38854 non-null  float64
 14  calculated_host_listings_count  48906 non-null  int64  
 15  availability_365                48906 non-null  int64  
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB

4. Are there any missing values?¶

In [11]:
# Missing-value count per column (sum the boolean mask down the rows).
df.isna().sum(axis=0)
Out[11]:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

5. How does the data look like mathematically?¶

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[12]:
id host_id latitude longitude price minimum_nights number_of_reviews reviews_per_month calculated_host_listings_count availability_365
count 4.890600e+04 4.890600e+04 48906.000000 48906.000000 48906.000000 48906.000000 48906.000000 38854.000000 48906.000000 48906.000000
mean 1.901287e+07 6.760480e+07 40.728952 -73.952175 152.711324 7.031612 23.300454 1.373151 7.142702 112.782031
std 1.098557e+07 7.860866e+07 0.054529 0.046154 240.128713 20.512489 44.607175 1.680270 32.948926 131.620370
min 2.539000e+03 2.438000e+03 40.499790 -74.244420 0.000000 1.000000 0.000000 0.010000 1.000000 0.000000
25% 9.464662e+06 7.809567e+06 40.690100 -73.983080 69.000000 1.000000 1.000000 0.190000 1.000000 0.000000
50% 1.967545e+07 3.078463e+07 40.723080 -73.955685 106.000000 3.000000 5.000000 0.720000 1.000000 45.000000
75% 2.915085e+07 1.074344e+08 40.763120 -73.936283 175.000000 5.000000 24.000000 2.020000 2.000000 227.000000
max 3.648724e+07 2.743213e+08 40.913060 -73.712990 10000.000000 1250.000000 629.000000 58.500000 327.000000 365.000000

6. Are there any duplicate values?¶

In [13]:
# Count rows that exactly repeat an earlier row (all columns compared;
# the first occurrence is not counted as a duplicate).
df.duplicated(keep="first").sum()
Out[13]:
11

7. How many unique values are there in each column?¶

In [14]:
# Number of distinct non-null values in each column.
df.nunique(axis=0, dropna=True)
Out[14]:
id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64
In [15]:
# Distinct values of the neighbourhood_group column, in order of first appearance.
df.loc[:, "neighbourhood_group"].unique()
Out[15]:
array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

8. Are there any outliers in the numerical columns?¶

In [16]:
# Box plot of the price column to expose outliers.
# Explicit data=/y= keywords keep the plot vertical regardless of seaborn version:
# a bare positional Series is read as `x` by older releases and as `data` by newer ones.
# The trailing semicolon suppresses the return-value repr in the cell output.
sns.boxplot(data=df, y="price").set(title="Distribution of price");
Out[16]:
<Axes: ylabel='price'>
No description has been provided for this image
In [17]:
# Box plot of the availability_365 column to expose outliers.
# Explicit data=/y= keywords keep the plot vertical regardless of seaborn version:
# a bare positional Series is read as `x` by older releases and as `data` by newer ones.
# The trailing semicolon suppresses the return-value repr in the cell output.
sns.boxplot(data=df, y="availability_365").set(title="Distribution of availability_365");
Out[17]:
<Axes: ylabel='availability_365'>
No description has been provided for this image